home *** CD-ROM | disk | FTP | other *** search
/ NeXT Education Software Sampler 1992 Fall / NeXT Education Software Sampler 1992 Fall.iso / Programming / Source / WAIS / ir / irverify.c < prev    next >
Encoding:
C/C++ Source or Header  |  1992-02-02  |  10.1 KB  |  363 lines

  1. #include "irverify.h"
  2. #include "irfiles.h"
  3. #include "panic.h"
  4. #include "futil.h"
  5.  
  6. #define TEST_READ false
  7.  
  8. /*---------------------------------------------------------------------------*/
  9.  
  10. void
  11. printIndex (db)
  12. database* db;
  13. /* iterate over the index printing the contents */
  14. {
  15.   serialPostingFile* spf = NULL;
  16.   char indexFileName[MAX_FILE_NAME_LEN + 1];
  17.   postingsForATerm* posts = NULL;
  18.   
  19.   spf = initSerialPostingFile(index_filename(indexFileName,db));
  20.   
  21.   while ((posts = getPostingsForNextTerm(spf)) != NULL)
  22.    { printPostingsForATerm(posts);
  23.      /* XXX dispose of them */
  24.    }
  25.    
  26.   disposeSerialPostingFile(spf);
  27. }
  28.  
  29. /*---------------------------------------------------------------------------*/
  30.  
  31. static void 
  32. print_dictionary_block_and_index _AP((unsigned char* block,long size,serialPostingFile* spf));
  33.  
  34. static void 
  35. print_dictionary_block_and_index(block,size,spf)
  36. unsigned char *block;
  37. long size;
  38. serialPostingFile* spf;
  39. /* this prints the contents of a dictionary block */
  40. {
  41.   long i;
  42.   postingsForATerm* posts = NULL;
  43.   
  44.   for(i = 0; i < size; i++)
  45.    {
  46.      char *word = dictionary_block_word(i, block);
  47.      long pos = dictionary_block_position(i, block);
  48.      if(word[0] == '\0')
  49.        break;
  50.      printf("Entry %3ld: %21s %7ld\n", i, word,pos);
  51.      posts = getPostingsAt(spf,pos);
  52.      printPostingsForATerm(posts);
  53.      /* XXX dispose of them postings */
  54.    }
  55. }
  56.  
  57. /*---------------------------------------------------------------------------*/
  58.  
  59. extern long number_of_dictionary_blocks;
  60. extern unsigned char *dictionary_header_block;
  61. extern unsigned char *dictionary_block;
  62.  
  63. void
  64. printIndexUsingDictionary(db)
  65. database* db;
  66. /* use the dictionary to go over the index */
  67. {
  68.   /* prints the contents of a dictionary */
  69.   FILE *dictStream = db->dictionary_stream;
  70.   long i;
  71.   long new_number_of_dictionary_blocks;
  72.   serialPostingFile* spf = NULL;
  73.   char indexFileName[MAX_FILE_NAME_LEN + 1];
  74.  
  75.   spf = initSerialPostingFile(index_filename(indexFileName,db));
  76.  
  77.   if(NULL == dictStream)
  78.     panic("dictionary dictStream is not open");
  79.   s_fseek(dictStream, 0L, SEEK_SET);
  80.   new_number_of_dictionary_blocks = read_bytes(DICTIONARY_HEADER_SIZE, dictStream);
  81.   if(new_number_of_dictionary_blocks > number_of_dictionary_blocks)
  82.     dictionary_header_block = NULL;
  83.   number_of_dictionary_blocks = new_number_of_dictionary_blocks;
  84.   printf("Number of dictionary blocks %ld\n", number_of_dictionary_blocks);
  85.   if(NULL == (dictionary_header_block =
  86.           read_dictionary_block(dictionary_header_block,
  87.                     DICTIONARY_HEADER_SIZE,
  88.                     number_of_dictionary_blocks,
  89.                     dictStream)))
  90.     panic("Could not read dictionary header block");
  91.   printf("The Dictionary Header Block:\n");
  92.   print_dictionary_block(dictionary_header_block, number_of_dictionary_blocks);
  93.   for(i = 0; i < number_of_dictionary_blocks; i++)
  94.   {
  95.     long pos = dictionary_block_position(i, dictionary_header_block);
  96.     if(NULL == (dictionary_block =
  97.         read_dictionary_block(dictionary_block,
  98.                       pos, DICTIONARY_BLOCK_SIZE, dictStream)))
  99.       panic("Could not read dictionary block %ld", pos);
  100.     printf("\n\nDictionary block %ld (position %ld):\n", i, pos);
  101.     print_dictionary_block_and_index(dictionary_block, DICTIONARY_BLOCK_SIZE,spf);
  102.   }
  103.   fseek(dictStream, 0L, SEEK_END);
  104.   disposeSerialPostingFile(spf);
  105. }
  106.  
  107. /*---------------------------------------------------------------------------*/
  108.  
  109. serialPostingFile*
  110. initSerialPostingFile(filename)
  111. char* filename;
  112. /* open an inverted index file create by irn8. return a structure
  113.    maintaining its state
  114.  */
  115. {
  116.   FILE* stream = NULL;
  117.   serialPostingFile* pf = NULL;
  118.  
  119.   stream = s_fopen(filename,"rb");
  120.   if (stream == NULL) /* can't open that file */
  121.     return(NULL);
  122.   s_fseek(stream,INDEX_HEADER_SIZE,SEEK_SET);
  123.  
  124.   pf = (serialPostingFile*)s_malloc((size_t)sizeof(serialPostingFile));
  125.   pf->stream = stream;
  126.   pf->length = file_length(stream);
  127.   pf->current_index_block = INDEX_HEADER_SIZE;
  128.  
  129.   return(pf);
  130. }
  131.  
  132. /*---------------------------------------------------------------------------*/
  133.  
  134. void
  135. disposeSerialPostingFile(pf)
  136. serialPostingFile* pf;
  137. {
  138.   s_fclose(pf->stream);
  139.   s_free(pf);
  140. }
  141.  
  142. /*---------------------------------------------------------------------------*/
  143.  
  144. void 
  145. printPostingsForATerm(pfat)
  146. postingsForATerm* pfat;
  147. {
  148.   long i;
  149.  
  150.   if (pfat->word[0] != '\0')
  151.     printf("word '%s'\n",pfat->word);
  152.     
  153.   for (i = 0; i < pfat->entries; i++)
  154.     printf("\tdoc %ld weight %ld\n",pfat->docs[i],pfat->weights[i]);
  155. }
  156.  
  157. /*---------------------------------------------------------------------------*/
  158.  
  159. postingsForATerm*
  160. getPostingsAt(spf,position)
  161. serialPostingFile* spf;
  162. long position;
  163. /* position better be a valid starting position! */
  164. {
  165.   fseek(spf->stream,position,SEEK_SET);
  166.   spf->current_index_block = position;
  167.   return(getPostingsForNextTerm(spf));
  168. }
  169.  
  170. /*---------------------------------------------------------------------------*/
  171.  
  172. postingsForATerm*
  173. getPostingsForNextTerm(spf)
  174. serialPostingFile* spf;
  175. {
  176.   postingsForATerm* posts = NULL;
  177.   boolean keepGoing = true;
  178.   
  179.   if (spf->current_index_block >= spf->length)
  180.     return(NULL);
  181.  
  182.   posts = (postingsForATerm*)s_malloc((size_t)sizeof(postingsForATerm));
  183.   posts->word[0] = '\0';
  184.   posts->entries = 0;
  185.  
  186.   while (keepGoing) 
  187.    { 
  188.      long flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
  189.      long next_index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
  190.      long index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
  191.  
  192.      if (flag == INDEX_BLOCK_DICTIONARY_FLAG)
  193.        { long last_index_block;
  194.      long index_block_size;
  195.      long number_of_occurances;
  196.      char word[MAX_WORD_LENGTH + 1];
  197.      if (0 > read_dictionary_index_block(spf->current_index_block,
  198.                          &last_index_block,
  199.                          &index_block_size,
  200.                          &number_of_occurances,
  201.                          word,
  202.                          spf->stream))
  203.        panic("read dictionary index block failed");
  204.      cprintf(TEST_READ,
  205.          "%ld: size %3ld word '%s',occurances %ld last block %ld\n",
  206.          spf->current_index_block,index_block_size,word,
  207.          number_of_occurances,next_index_block);
  208.      strcpy(posts->word,word);
  209.        }
  210.  
  211.      else if (flag == INDEX_BLOCK_NOT_FULL_FLAG)
  212.        { cprintf(TEST_READ,"%ld: size %3ld Not full,valid entries %ld\n",
  213.          spf->current_index_block,index_block_size,next_index_block);
  214.      readPostings(spf,posts);
  215.      keepGoing = false;
  216.        }
  217.  
  218.      else if (flag == INDEX_BLOCK_FULL_FLAG)
  219.        { cprintf(TEST_READ,"%ld: size %3ld full block,next block %ld\n",
  220.          spf->current_index_block,index_block_size,next_index_block);
  221.      readPostings(spf,posts);
  222.      keepGoing = false;
  223.        }
  224.  
  225.      else 
  226.        panic("bad entry %ld (ftell %ld),flag was %ld",
  227.          spf->current_index_block,ftell(spf->stream),flag);
  228.  
  229.      spf->current_index_block += index_block_size;
  230.      s_fseek(spf->stream,spf->current_index_block,SEEK_SET);
  231.    }
  232.  
  233.   return(posts);
  234. }
  235.  
  236. /*---------------------------------------------------------------------------*/
  237.  
  238. void
  239. disposePostingsForATerm(pfat)
  240. postingsForATerm* pfat;
  241. {
  242.   s_free(pfat->docs);
  243.   s_free(pfat->weights);
  244.   s_free(pfat);
  245. }
  246.  
  247. /*---------------------------------------------------------------------------*/
  248.  
  249. void 
  250. removePostings(pfat,start,run)
  251. postingsForATerm* pfat;
  252. long start;
  253. long run;
  254. /* remove postings start through start + run from the pfat */
  255. {
  256.   void* toPtr = NULL;
  257.   long runLen;
  258.   long toMove;
  259.  
  260.   if (start + run > pfat->entries)
  261.     return; /* this is an error */
  262.  
  263.   toPtr = (void*)(pfat->docs + (start * sizeof(docID)));
  264.   runLen = run * sizeof(docID);
  265.   toMove = ((pfat->entries - start) * sizeof(docID)) - runLen;
  266.   memmove(toPtr,toPtr + runLen,toMove);
  267.  
  268.   toPtr = (void*)(pfat->weights + (start * sizeof(postingWeight)));
  269.   runLen = run * sizeof(docID);
  270.   toMove = ((pfat->entries - start) * sizeof(postingWeight)) - runLen;
  271.   memmove(toPtr,toPtr + runLen,toMove);
  272.  
  273.   pfat->entries -= run;
  274. }
  275.  
  276. /*---------------------------------------------------------------------------*/
  277.  
  278. void
  279. readPostings(spf,posts)
  280. serialPostingFile* spf;
  281. postingsForATerm* posts;
  282. {
  283.   long not_full_flag = INDEX_BLOCK_FULL_FLAG;
  284.   long count,index_block_size;
  285.   long document_id,weight,number_of_valid_entries;
  286.   long index_block = spf->current_index_block;
  287.   
  288.   if (index_block >= 0)
  289.     {
  290.       /* read the index block */
  291.       if (0 != fseek(spf->stream,(long)index_block,SEEK_SET))
  292.     { 
  293.       fprintf(stderr,
  294.           "fseek failed into the inverted file to position %ld\n",
  295.           (long)index_block); 
  296.       return;
  297.     }
  298.       
  299.       not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
  300.       index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
  301.       index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
  302.       if (EOF == index_block_size) 
  303.     { fprintf(stderr,"reading from the index file failed\n");
  304.       return;
  305.     }
  306.       
  307.       if (not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG)
  308.     { /* not full */
  309.       number_of_valid_entries = index_block;
  310.     }
  311.       else if (not_full_flag == INDEX_BLOCK_FULL_FLAG)
  312.     { /* full */
  313.       number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
  314.     }
  315.       else
  316.     { /* bad news,file is corrupted.  this should return error
  317.          code rather than panicing XXX */
  318.     panic("Expected the flag in the inverted file to be valid.  it is %ld",
  319.           not_full_flag);
  320.         }
  321.  
  322.       cprintf(TEST_READ,"  number of valid bytes: %ld\n",
  323.           number_of_valid_entries);
  324.       
  325.       for (count = 0; count < number_of_valid_entries; 
  326.        count = count + INDEX_ELEMENT_SIZE)
  327.     {
  328.       document_id = read_bytes(DOCUMENT_ID_SIZE,spf->stream);
  329.       (void)read_bytes(WORD_POSITION_SIZE,spf->stream);
  330.       (void)read_bytes(CHARACTER_POSITION_SIZE,spf->stream);
  331.       weight = read_bytes(WEIGHT_SIZE,spf->stream);
  332.       cprintf(TEST_READ,"    entry %ld,Doc_id: %ld,weight %ld\n",
  333.           count % INDEX_ELEMENT_SIZE,document_id,weight);
  334.       if(EOF == weight) 
  335.         { fprintf(stderr,"reading from the doc-id table failed\n");
  336.           return;
  337.         }
  338.       posts->entries++;
  339.       posts->docs = (docID*)s_realloc(posts->docs,
  340.                   (size_t)(sizeof(docID) * posts->entries));
  341.       posts->docs[posts->entries - 1] = document_id;
  342.       posts->weights = (postingWeight*)s_realloc(posts->weights,
  343.                      (size_t)(sizeof(postingWeight) * 
  344.                           posts->entries));
  345.       posts->weights[posts->entries - 1] = weight;
  346.     }
  347.     }
  348. }
  349.  
  350. /*---------------------------------------------------------------------------*/
  351.  
  352.  
  353.  
  354.  
  355.  
  356.  
  357.  
  358.  
  359.  
  360.  
  361.  
  362.  
  363.